In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# BAGGING, BOOSTING, and RANDOM FORESTS
# Import necessary libraries
! pip install pandas;
! pip install numpy;
! pip install scikit-learn;
! pip install matplotlib;
! pip install ISLP;
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt;
from ISLP import load_data
# Load the Auto dataset from package ISLP
from ISLP import load_data
Auto = load_data('Auto')
In [12]:
print(Auto.columns)
Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin'], dtype='object')
In [10]:
# X = Auto.drop(columns=["mpg", "name"])
X = Auto.drop(columns=["mpg"])
y = Auto["mpg"]
In [14]:
# 1. Bagging and Random Forests
# Random forest model with default settings (similar to R's default m = p/3)
rf = RandomForestRegressor()
rf.fit(X, y)
# Model summary
print(f"Mean Squared Error: {mean_squared_error(y, rf.predict(X)):.4f}")
print(f"Variance explained (%): {rf.score(X, y) * 100:.2f}")
Mean Squared Error: 1.0002 Variance explained (%): 98.35
In [16]:
# Feature importance
importance = rf.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': importance}).sort_values(by='Importance', ascending=False)
print(importance_df)
# Plotting feature importance
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.xlabel("Importance")
plt.title("Feature Importance in Random Forest")
plt.show()
Feature Importance 1 displacement 0.375632 3 weight 0.205921 0 cylinders 0.140268 2 horsepower 0.121348 5 year 0.120379 4 acceleration 0.030020 6 origin 0.006431
In [24]:
# 2. Cross-validation
# Reset indices for X and y
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)
# Randomly split data into training and validation sets
Z = np.random.choice(len(y), 200, replace=False)
# Select training and test sets based on Z
X_train, X_test = X.loc[Z], X.drop(Z)
y_train, y_test = y.loc[Z], y.drop(Z)
In [26]:
# Fit random forest on training subset
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Calculate mean squared error
mse_cv = mean_squared_error(y_test, y_pred)
print(f"Cross-validated MSE: {mse_cv:.4f}")
Cross-validated MSE: 6.1369
In [40]:
# 3. Searching for the optimal solution
reg_tree = RandomForestRegressor(oob_score=True, n_estimators=147, max_features=3, random_state=42)
reg_tree.fit(X_train, y_train)
# Access the OOB score
print("OOB Score:", reg_tree.oob_score_)
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
plt.figure(figsize=(30, 20))
plot_tree(reg_tree.estimators_[0], feature_names=X_train.columns, filled=True, fontsize=10)
plt.show()
OOB Score: 0.828643068370655
In [42]:
# Sample sizes for trees
n_trees = range(1, 300, 10) # You can adjust this range as needed
oob_scores = []
# Collect OOB scores for different numbers of trees
for n in n_trees:
reg_tree = RandomForestRegressor(n_estimators=n, oob_score=True, random_state=42)
reg_tree.fit(X_train, y_train)
oob_scores.append(reg_tree.oob_score_)
# Plotting OOB error
plt.figure(figsize=(10, 6))
plt.plot(n_trees, oob_scores, marker='o', linestyle='-', color='b')
plt.title('OOB Score vs. Number of Trees')
plt.xlabel('Number of Trees')
plt.ylabel('OOB Score')
plt.grid()
plt.show()
C:\Users\baron\AppData\Local\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:615: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable OOB estimates. warn(
In [48]:
# Finding the optimal number of trees to minimize MSE and maximize R²
errors = reg_tree.oob_score_
optimal_trees = np.argmin(errors) + 1
print(f"Optimal number of trees: {optimal_trees}")
# Final random forest with optimal trees
rf_optimal = RandomForestRegressor(max_features=7, n_estimators=optimal_trees)
rf_optimal.fit(X, y)
print(f"Optimal Model MSE: {mean_squared_error(y, rf_optimal.predict(X)):.4f}")
print(f"Optimal Model R²: {rf_optimal.score(X, y) * 100:.2f}")
Optimal number of trees: 1 Optimal Model MSE: 5.9720 Optimal Model R²: 90.17
In [46]:
# 4. Cross-validation loop to find the best m and number of trees
cv_errors = []
n_trees = []
for m in range(1, 8):
rf_m = RandomForestRegressor(max_features=m, oob_score=True)
rf_m.fit(X_train, y_train)
mse_min = np.min(rf_m.oob_score_)
opt_trees = np.argmin(rf_m.oob_score_) + 1
cv_errors.append(mse_min)
n_trees.append(opt_trees)
# Plot CV errors
plt.figure(figsize=(10, 6))
plt.plot(range(1, 8), cv_errors, marker='o')
plt.xlabel("Number of Features (m)")
plt.ylabel("Cross-Validation Error (MSE)")
plt.title("Cross-Validation Error for Different m Values")
plt.show()
# Print results
optimal_m = np.argmin(cv_errors) + 1
print(f"Optimal m: {optimal_m}, with MSE: {cv_errors[optimal_m-1]:.4f}")
# Optimal random forest with best m and number of trees
rf_final = RandomForestRegressor(max_features=optimal_m, n_estimators=n_trees[optimal_m-1])
rf_final.fit(X, y)
print(f"Final Optimal Model MSE: {mean_squared_error(y, rf_final.predict(X)):.4f}")
print(f"Final Optimal Model R²: {rf_final.score(X, y) * 100:.2f}")
# Feature importance for the optimal model
importance_optimal = rf_final.feature_importances_
importance_df_optimal = pd.DataFrame({'Feature': features, 'Importance': importance_optimal}).sort_values(by='Importance', ascending=False)
print(importance_df_optimal)
Optimal m: 7, with MSE: 0.8105 Final Optimal Model MSE: 4.7235 Final Optimal Model R²: 92.23 Feature Importance 0 cylinders 0.651802 2 horsepower 0.130907 5 year 0.101565 3 weight 0.066039 4 acceleration 0.031475 1 displacement 0.015715 6 origin 0.002498